from gxl_ai_utils.utils import utils_file

# Chinese unit table taken from the aishell2 recipe.
cn_path = "/home/node54_tmpdata/xlgeng/code/wenet_mine/examples/aishell2/s0/data/dict/unit.txt"
# presumably a mapping keyed by unit string (first column of the file);
# only its keys are consumed further down -- verify against utils_file.
cn_dict = utils_file.load_dict_from_scp(cn_path)

# English unigram-500 BPE unit table taken from the librispeech recipe.
en_path = "/home/node54_tmpdata/xlgeng/code/wenet_mine/examples/librispeech/s0/data/lang_char/unigram500_units.txt"
en_dict = utils_file.load_dict_from_scp(en_path)

def is_chinese_char(char):
    """Return True if *char* is exactly one CJK Unified Ideograph.

    The accepted range is U+4E00..U+9FFF (the basic CJK Unified Ideographs
    block), which holds both simplified and traditional Han characters.
    Characters outside that block (e.g. the extension blocks) are rejected.

    Args:
        char: The string to test.

    Returns:
        True when *char* has length 1 and lies in U+4E00..U+9FFF,
        otherwise False (including for the empty string).
    """
    # str comparison is lexicographic, so without the length guard a longer
    # string that merely starts with an ideograph would also land in range.
    return len(char) == 1 and '\u4e00' <= char <= '\u9fff'

# Keep only the entries of the Chinese table that are a single Chinese
# character; every other key in cn_dict is dropped.
new_cn_list = [
    key for key in cn_dict
    if len(key) == 1 and is_chinese_char(key)
]

# Merged unit order: every English unit first (in en_dict order), then the
# selected Chinese characters.
# NOTE(review): the Chinese characters are appended after *all* English
# units, including any special token that sits at the end of the English
# table -- confirm nothing downstream assumes that token keeps the last id.
en_list = list(en_dict)
en_list.extend(new_cn_list)

# Assign consecutive ids starting at 0. dict.fromkeys() drops a unit that
# shows up in both tables while keeping first-seen order; numbering the raw
# list instead would overwrite the earlier id and leave a hole in the range.
new_result_dict = {
    unit: index for index, unit in enumerate(dict.fromkeys(en_list))
}

output_unit_path = 'data/dict/unit_char_bpe500.txt'
utils_file.write_dict_to_scp(new_result_dict, output_unit_path)

# Place the English BPE model next to the merged unit table.
pbe_model_path = "/home/node54_tmpdata/xlgeng/code/wenet_mine/examples/librispeech/s0/data/lang_char/unigram500.model"
output_bpe_model_path = 'data/dict/bpe500.model'
utils_file.copy_file(pbe_model_path, output_bpe_model_path)